Purpose


Introduction

Setup

Begin by downloading and opening the following packages into your library. The gelnet library will be used to train the model, dplyr and gdata are used for data manipulation.

library(gelnet)
library(dplyr)
library(gdata)

Data

The the pcbc data is organized in a data frame with 99 samples as columns and 219 methylation probes as rows.

load("pcbc.data.Rda")
pcbc.data
load("pcbc.pd.f.Rda")
head(pcbc.pd.f)

Then find the mean center by subtracting the mean of each probe from the entire pcbc data. The mean of each probe just be in a numeric vector the same size as the number of probes, in this case 219.

m <- apply(pcbc.data, 1, mean)
m[1:5]
cg02927655 cg15948871 cg17676824 cg25552705 cg21434327 
 0.4130770  0.2652034  0.3821945  0.4168715  0.3776927 
pcbc.data.2 <- pcbc.data - m
pcbc.data.2

Identify stem cells and break up all samples into 2 groups:

  • Stem cell (‘X.tr’ object)
  • not stem cell (‘X.bk’ object)
# Define PCBC groups (SC and non.SC)
M1_smp <- pcbc.pd.f[pcbc.pd.f$Diffname_short %in% "SC",] #SC
M2_smp <- pcbc.pd.f[!(pcbc.pd.f$Diffname_short %in% "SC"),] #non-SC
# Select PCBC data
X.tr <- pcbc.data.2[, as.character(M1_smp$UID)] # 44 samples
X.bk <- pcbc.data.2[, as.character(M2_smp$UID)] # 55 samples

Now we can begin to train the the one-class model with the gelnet function. The gelnet function can be used for Linear Regression, Binary Classification and One class Problems by using an iterative method called coordinated descent (Sokolov et al. 2016).

gelnet(X, y, l1, l2)

It has four main arguments described below:

  • X: n by p matrix => transpose(‘X.r’)
  • y: ‘NULL’ for one class models
  • l1: coefficient for the L1-norm penalty => ‘0’
  • l2: coefficient for the L2-norm penalty => ‘1’

Make sure you transpose the matrix so that the genes are listed as rows and samples as columns. Then store the results as a tsv file (pcbc-stemsig.p219.rda).

## Train a one-class model
mm <- gelnet(t(X.tr), NULL, 0, 1) #NULL for a one-class task 
Training a one-class model
Iteration 1 : f = 0.6931472 
Iteration 2 : f = 0.3320004 
Iteration 3 : f = 0.3263175 
## Store the signature to a file
save( mm, file = "pcbc-stemsig.p219.Rda")

Leave One Out Cross-Validation

To test how the model’s performance by using leave one out cross-validation. This process has three steps:

  1. Train model on non-left-out data
  2. Score the left-out sample against the background
  3. AUC = P( left-out sample is scored above the background )
# Cross-validation with linear model:
# Perform leave-one-out cross-validation
auc <- c()
for(i in 1:ncol(X.tr)) {
  ## Train a model on non-left-out data
  X1 <- X.tr[,-i]
  X1 <- as.matrix(X1)
  K <- t(X1) %*% X1 / nrow(X1)
  m1 <- gelnet.ker(K, NULL, lambda = 1)
  w1 <- X1 %*% m1$v
  
  ## Score the left-out sample against the background
  X.bk <- X.bk[rownames(X.tr),]
  X.bk <- as.matrix(X.bk)
  s.bk <- t(w1) %*% X.bk
  s.bk <- unmatrix(s.bk)
  
  s1 <- t(w1) %*% X.tr[,i]
  s1 <- unmatrix(s1)
  
  ## AUC = P( left-out sample is scored above the background )
  auc[i] <- sum(s1 > s.bk) / length(s.bk)
  cat( "Current AUC: ", auc[i], "\n" )
  cat( "Average AUC: ", mean(auc), "\n" )
}

If the validation is successful you will notice that the auc variable will be a numeric vector consisting of only 1’s.

print(s1)
   r1:c1 
4.730943 
head(auc)
[1] 1 1 1 1 1 1
all(auc == 1)
[1] TRUE

Replace NA

The Replace NA function is used to replace any values that are NA (not available) with either the mean or the median value of the probe for a give group.

  1. check for NA values
  2. locate the NA values
  3. calculate the mean or median for the probe where each NA is found
  4. replace the values
replace.NA <-function(data,type.info,by = "mean"){
  if(!"group" %in% colnames(type.info)) stop("type.info must have group column")
  if(!"sample" %in% colnames(type.info)) stop("type.info must have a sample column")
  
  # Do we have NAs?
  if(is.na(table(is.na(data))["TRUE"])){
    message("No NAs were found")
    return(data)
  }
  # get NAs index 
  idx <- which(is.na(data) == TRUE,arr.ind=TRUE)
  count <- table(rownames(idx))
  message("======= Status Number of NA in probes ========")
  message("--------------------- Summary------------------")
  print(summary(as.numeric(count)))
  message("\n----------- Probes with more nb of NAs -----------")
  print(head(sort(count,decreasing = T)))
  message("===============================================")
                 
  idx <- cbind(idx, mean = NA, median = NA)
  
  # For each NA value calculate the mean for the same probe for the samples
  # where it belongs
  for(line in 1:nrow(idx)){
    row <- idx[line,1]
    col <- idx[line,2]
    probe <- rownames(idx)[line]
    sample <- colnames(data)[col]
    group <- type.info[type.info$sample == sample,"group"]
    samples.in.group <- type.info[type.info$group == group,]$sample
    
    # get the probe value for all samples in the same group 
    aux <- data[rownames(data) %in% probe, colnames(data) %in% samples.in.group] 
    
    idx[line,3] <- mean(as.numeric(aux),na.rm = TRUE)
    idx[line,4] <- median(as.numeric(aux),na.rm = TRUE)
  }
  # Step 2 replace
  for(line in 1:nrow(idx)){
    row <- idx[line,1]
    col <- idx[line,2]
    if(by == "mean"){
      data[idx[line,1],idx[line,2]] <- idx[line,3]  
    } else if(by == "median") { 
      data[idx[line,1],idx[line,2]] <- idx[line,4]
    }
  }
  return(data)
}

Score PanCan33 Data

Use the signature that was created and stored as ‘pcbc-stemsig.p219.rda’ to now score PanCan33 data. First load the data data.pan Replace empty values with the median probe values.

## Uses the signature to score PanCan33 data
# load TCGA 450K data (subset of 219 probes of interest)
load("data.pan.Rda") 
data.pan
load("type.info.Rda") #(contains tumor type info)
type.info
testset <- replace.NA(data.pan, type.info, by="median") 
======= Status Number of NA in probes ========
--------------------- Summary------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1.00    1.00    1.00    9.36    3.00  222.00 

----------- Probes with more nb of NAs -----------

cg04876127 cg10665848 cg20290850 cg01095506 cg05056545 cg15701612 
       222        101         35         15         11         10 
===============================================
head(testset)

Load the signature w which should be a numeric row vector the size of the probes of interest.

load("pcbc-stemsig.p219.Rda")
w <- mm$w
w[1:10]
 cg02927655  cg15948871  cg17676824  cg25552705  cg21434327  cg06678662  cg20300315  cg12257394  cg10381520  cg04193909 
-0.03251136 -0.03270579 -0.03151675 -0.04281638 -0.02921891 -0.03178537 -0.02974974 -0.05360807 -0.04052402 -0.04616716 
X <- testset[as.character(names(w)),]
head(X)

Convert X into a matrix

X <- as.matrix(X)
X[1:3,1:3]
           TCGA-05-4384-01A-01D-1756-05 TCGA-05-4390-01A-02D-1756-05 TCGA-05-4396-01A-21D-1856-05
cg02927655                    0.0978782                     0.094127                    0.0970064
cg15948871                    0.7006290                     0.675834                    0.7751590
cg17676824                    0.8089120                     0.815386                    0.8020920

Score via linear model. The resulting variable ss will be row vector 9627 in length

ss <- t(w) %*% X
ss[1,1:5]
TCGA-05-4384-01A-01D-1756-05 TCGA-05-4390-01A-02D-1756-05 TCGA-05-4396-01A-21D-1856-05 TCGA-05-4405-01A-21D-1856-05 
                   -4.813315                    -5.012777                    -4.969649                    -5.112913 
TCGA-05-4410-01A-21D-1856-05 
                   -4.933056 

Scale the svores into a ratio from 0 to 1. and store as data frame.

## Scale the scores to be between 0 and 1
ss <- ss - min(ss)
ss <- ss / max(ss)
ss <- as.data.frame(t(ss))
colnames(ss) <- "mDNAsi"   
head(ss)

Save scores to a Rda file.

save(ss, file = "TCGA_mDNAsi.Rda")

Continuous Code

# One class model - 450K data
# Files located @ https://drive.google.com/drive/folders/0BybEcxkBv6VqcmJ0QlA3bVJhc0E?usp=sharing

# Load required libraries
library(gelnet)
library(dplyr)
library(gdata)

# load PCBC metadata (contains group info)
load("pcbc.pd.f.Rda")
# load PCBC 450K data (subset of 219 probes of interest)
load("pcbc.data.Rda")

## Mean-center the data
m <- apply(pcbc.data, 1, mean )
pcbc.data.2 <- pcbc.data - m

# Define PCBC groups (SC and non.SC)
M1_smp <- pcbc.pd.f[pcbc.pd.f$Diffname_short %in% "SC",] #SC
M2_smp <- pcbc.pd.f[!(pcbc.pd.f$Diffname_short %in% "SC"),] #non-SC

# Select PCBC data
X.tr <- pcbc.data.2[, as.character(M1_smp$UID)] # 44 samples
X.bk <- pcbc.data.2[, as.character(M2_smp$UID)] # 55 samples

## Train a one-class model
mm <- gelnet(t(X.tr), NULL, 0, 1) # NULL for a one-class task 

## Store the signature to a file
save( mm, file = "pcbc-stemsig.p219.Rda")

# Cross-validation with linear model:
## Perform leave-one-out cross-validation
auc <- c()
for( i in 1:ncol(X.tr) )
{
  ## Train a model on non-left-out data
  X1 <- X.tr[,-i]
  X1 <- as.matrix(X1)
  K <- t(X1) %*% X1 / nrow(X1)
  m1 <- gelnet.ker(K, NULL, lambda = 1)
  w1 <- X1 %*% m1$v
  
  ## Score the left-out sample against the background
  X.bk <- X.bk[rownames(X.tr),]
  X.bk <- as.matrix(X.bk)
  s.bk <- t(w1) %*% X.bk
  s.bk <- unmatrix(s.bk)
  
  s1 <- t(w1) %*% X.tr[,i]
  s1 <- unmatrix(s1)
  
  ## AUC = P( left-out sample is scored above the background )
  auc[i] <- sum(s1 > s.bk) / length(s.bk)
  cat( "Current AUC: ", auc[i], "\n" )
  cat( "Average AUC: ", mean(auc), "\n" )
}

## Uses the signature to score PanCan33 data
# load TCGA 450K data (subset of 219 probes of interest)
load("data.pan.Rda") 
# Function to replace NA values with median of probe values by tumor type
source("replaceNA.R")
load("type.info.Rda") #(contains tumor type info)
testset <- replace.NA(data.pan, type.info, by = "median") 

## Load the signature
load("pcbc-stemsig.p219.Rda")
w <- mm$w

X <- testset[as.character(names(w)),]
X <- as.matrix(X)

## Score via linear model
ss <- t(w) %*% X
## Scale the scores to be between 0 and 1
ss <- ss - min(ss)
ss <- ss / max(ss)
ss <- as.data.frame(t(ss))

colnames(ss) <- "mDNAsi"   
save(ss, file = "TCGA_mDNAsi.Rda")

References

Sokolov, Artem, Daniel E Carlin, Evan O Paull, Robert Baertsch, and Joshua M Stuart. 2016. “Pathway-Based Genomics Prediction Using Generalized Elastic Net.” PLoS Comput Biol 12 (3). Public Library of Science: e1004790.

---
title: "One class model - 450K data"
author: "Tathiane Maistro Malta"
output:
  html_notebook:
    highlight: tango
    theme: journal
    toc: yes
    toc_float:
      collapsed: no
  html_document:
    toc: yes
editor_options:
  chunk_output_type: inline
bibliography: bibliography.bib
---
***
# Purpose 

***

# Introduction 

# Setup 

Begin by downloading and opening the following packages into your library.
The [gelnet](https://cran.r-project.org/web/packages/gelnet/index.html) library will be used to train the model, 
[dplyr](https://cran.r-project.org/web/packages/dplyr/index.html) and [gdata](https://cran.r-project.org/web/packages/gdata/index.html) are used for data manipulation.
```{r}
library(gelnet)
library(dplyr)
library(gdata)
```

# Data 

The the pcbc data is organized in a data frame with 99 samples as columns and 219 methylation probes as rows.
```{r}
load("pcbc.data.Rda")
pcbc.data
```

```{r}
load("pcbc.pd.f.Rda")
head(pcbc.pd.f)
```


Then find the mean center by subtracting the mean of each probe from the entire pcbc data. 
The mean of each probe just be in a numeric vector the same size as the number of probes, in this case 219.
```{r}
m <- apply(pcbc.data, 1, mean)
m[1:5]
```
```{r}
pcbc.data.2 <- pcbc.data - m
pcbc.data.2
```


Identify stem cells and break up all samples into 2 groups:

* Stem cell ('X.tr' object)
* not stem cell ('X.bk' object)

```{r}
# Define PCBC groups (SC and non.SC)
M1_smp <- pcbc.pd.f[pcbc.pd.f$Diffname_short %in% "SC",] #SC
M2_smp <- pcbc.pd.f[!(pcbc.pd.f$Diffname_short %in% "SC"),] #non-SC

# Select PCBC data
X.tr <- pcbc.data.2[, as.character(M1_smp$UID)] # 44 samples
X.bk <- pcbc.data.2[, as.character(M2_smp$UID)] # 55 samples
```

Now we can begin to train the the one-class model with the [gelnet](https://www.rdocumentation.org/packages/gelnet/versions/1.2.1/topics/gelnet) function. 
The gelnet function can be used for Linear Regression, 
Binary Classification and One class Problems by using an iterative method called 
coordinated descent [@sokolov2016pathway].

```{r, eval=FALSE}
gelnet(X, y, l1, l2)
```

It has four main arguments described below:

* **X**: n by p matrix => transpose('X.r')
* **y**: 'NULL' for one class models 
* **l1**: coefficient for the L1-norm penalty => '0' 
* **l2**: coefficient for the L2-norm penalty => '1'

Make sure you transpose the matrix so that the genes are listed as rows and samples as columns.
Then store the results as a tsv file (pcbc-stemsig.p219.rda). 
```{r}
## Train a one-class model
mm <- gelnet(t(X.tr), NULL, 0, 1) #NULL for a one-class task 

## Store the signature to a file
save( mm, file = "pcbc-stemsig.p219.Rda")
```



# Leave One Out Cross-Validation 

To test how the model's performance by using leave one out cross-validation. 
This process has three steps: 

1. Train model on non-left-out data
2. Score the left-out sample against the background
3. AUC = P( left-out sample is scored above the background )

```{r, results="hide"}
# Cross-validation with linear model:
# Perform leave-one-out cross-validation
auc <- c()
for(i in 1:ncol(X.tr)) {
  ## Train a model on non-left-out data
  X1 <- X.tr[,-i]
  X1 <- as.matrix(X1)
  K <- t(X1) %*% X1 / nrow(X1)
  m1 <- gelnet.ker(K, NULL, lambda = 1)
  w1 <- X1 %*% m1$v
  
  ## Score the left-out sample against the background
  X.bk <- X.bk[rownames(X.tr),]
  X.bk <- as.matrix(X.bk)
  s.bk <- t(w1) %*% X.bk
  s.bk <- unmatrix(s.bk)
  
  s1 <- t(w1) %*% X.tr[,i]
  s1 <- unmatrix(s1)
  
  ## AUC = P( left-out sample is scored above the background )
  auc[i] <- sum(s1 > s.bk) / length(s.bk)
  cat( "Current AUC: ", auc[i], "\n" )
  cat( "Average AUC: ", mean(auc), "\n" )
}
```


If the validation is successful you will notice that the auc variable will be a numeric vector consisting of only 1's.  
```{r}
print(s1)
```
```{r}
head(auc)
all(auc == 1)
```

# Replace NA 

The Replace NA function is used to replace any values that are NA (not available) 
with  either the mean or the median value of the probe for a give group. 

1. check for NA values
2. locate the NA values 
3. calculate the mean or median for the probe where each NA is found 
4. replace the values

```{r, message = FALSE, error = FALSE}
replace.NA <-function(data,type.info,by = "mean"){
  if(!"group" %in% colnames(type.info)) stop("type.info must have group column")
  if(!"sample" %in% colnames(type.info)) stop("type.info must have a sample column")
  
  # Do we have NAs?
  if(is.na(table(is.na(data))["TRUE"])){
    message("No NAs were found")
    return(data)
  }
  # get NAs index 
  idx <- which(is.na(data) == TRUE,arr.ind=TRUE)
  count <- table(rownames(idx))
  message("======= Status Number of NA in probes ========")
  message("--------------------- Summary------------------")
  print(summary(as.numeric(count)))
  message("\n----------- Probes with more nb of NAs -----------")
  print(head(sort(count,decreasing = T)))
  message("===============================================")
                 
  idx <- cbind(idx, mean = NA, median = NA)
  
  # For each NA value calculate the mean for the same probe for the samples
  # where it belongs
  for(line in 1:nrow(idx)){
    row <- idx[line,1]
    col <- idx[line,2]
    probe <- rownames(idx)[line]
    sample <- colnames(data)[col]
    group <- type.info[type.info$sample == sample,"group"]
    samples.in.group <- type.info[type.info$group == group,]$sample
    
    # get the probe value for all samples in the same group 
    aux <- data[rownames(data) %in% probe, colnames(data) %in% samples.in.group] 
    
    idx[line,3] <- mean(as.numeric(aux),na.rm = TRUE)
    idx[line,4] <- median(as.numeric(aux),na.rm = TRUE)
  }
  # Step 2 replace
  for(line in 1:nrow(idx)){
    row <- idx[line,1]
    col <- idx[line,2]
    if(by == "mean"){
      data[idx[line,1],idx[line,2]] <- idx[line,3]  
    } else if(by == "median") { 
      data[idx[line,1],idx[line,2]] <- idx[line,4]
    }
  }
  return(data)
}
```

# Score PanCan33 Data 

Use the signature that was created and stored as 'pcbc-stemsig.p219.rda' to now score PanCan33 data. 
First load the data data.pan
Replace empty values with the median probe values.

```{r}
## Uses the signature to score PanCan33 data
# load TCGA 450K data (subset of 219 probes of interest)
load("data.pan.Rda") 
data.pan
```
```{r}
load("type.info.Rda") #(contains tumor type info)
type.info
```
```{r}
testset <- replace.NA(data.pan, type.info, by="median") 
head(testset)
```


Load the signature w which should be a numeric row vector the size of the probes of interest. 
```{r}
load("pcbc-stemsig.p219.Rda")
w <- mm$w
w[1:10]
```


```{r}
X <- testset[as.character(names(w)),]
head(X)
```

Convert X into a matrix
```{r}
X <- as.matrix(X)
X[1:3,1:3]
```


Score via linear model. The resulting variable ss will be row vector 9627 in length 
```{r}
ss <- t(w) %*% X
ss[1,1:5]
```

Scale the svores into a ratio from 0 to 1. and store as data frame. 
```{r}
## Scale the scores to be between 0 and 1
ss <- ss - min(ss)
ss <- ss / max(ss)
ss <- as.data.frame(t(ss))
colnames(ss) <- "mDNAsi"   
head(ss)
```

Save scores to a Rda file.
```{r}
save(ss, file = "TCGA_mDNAsi.Rda")
```

# Continuous Code 

```{r,eval = FALSE}
# One class model - 450K data
# Files located @ https://drive.google.com/drive/folders/0BybEcxkBv6VqcmJ0QlA3bVJhc0E?usp=sharing

# Load required libraries
library(gelnet)
library(dplyr)
library(gdata)

# load PCBC metadata (contains group info)
load("pcbc.pd.f.Rda")
# load PCBC 450K data (subset of 219 probes of interest)
load("pcbc.data.Rda")

## Mean-center the data
m <- apply(pcbc.data, 1, mean )
pcbc.data.2 <- pcbc.data - m

# Define PCBC groups (SC and non.SC)
M1_smp <- pcbc.pd.f[pcbc.pd.f$Diffname_short %in% "SC",] #SC
M2_smp <- pcbc.pd.f[!(pcbc.pd.f$Diffname_short %in% "SC"),] #non-SC

# Select PCBC data
X.tr <- pcbc.data.2[, as.character(M1_smp$UID)] # 44 samples
X.bk <- pcbc.data.2[, as.character(M2_smp$UID)] # 55 samples

## Train a one-class model
mm <- gelnet(t(X.tr), NULL, 0, 1) # NULL for a one-class task 

## Store the signature to a file
save( mm, file = "pcbc-stemsig.p219.Rda")

# Cross-validation with linear model:
## Perform leave-one-out cross-validation
auc <- c()
for( i in 1:ncol(X.tr) )
{
  ## Train a model on non-left-out data
  X1 <- X.tr[,-i]
  X1 <- as.matrix(X1)
  K <- t(X1) %*% X1 / nrow(X1)
  m1 <- gelnet.ker(K, NULL, lambda = 1)
  w1 <- X1 %*% m1$v
  
  ## Score the left-out sample against the background
  X.bk <- X.bk[rownames(X.tr),]
  X.bk <- as.matrix(X.bk)
  s.bk <- t(w1) %*% X.bk
  s.bk <- unmatrix(s.bk)
  
  s1 <- t(w1) %*% X.tr[,i]
  s1 <- unmatrix(s1)
  
  ## AUC = P( left-out sample is scored above the background )
  auc[i] <- sum(s1 > s.bk) / length(s.bk)
  cat( "Current AUC: ", auc[i], "\n" )
  cat( "Average AUC: ", mean(auc), "\n" )
}

## Uses the signature to score PanCan33 data
# load TCGA 450K data (subset of 219 probes of interest)
load("data.pan.Rda") 
# Function to replace NA values with median of probe values by tumor type
source("replaceNA.R")
load("type.info.Rda") #(contains tumor type info)
testset <- replace.NA(data.pan, type.info, by = "median") 

## Load the signature
load("pcbc-stemsig.p219.Rda")
w <- mm$w

X <- testset[as.character(names(w)),]
X <- as.matrix(X)

## Score via linear model
ss <- t(w) %*% X
## Scale the scores to be between 0 and 1
ss <- ss - min(ss)
ss <- ss / max(ss)
ss <- as.data.frame(t(ss))

colnames(ss) <- "mDNAsi"   
save(ss, file = "TCGA_mDNAsi.Rda")
```

# References






